#### Data Screening Script — Class 2 ####

## Load data ##

# Set the working directory to the path where the input files are found.
# NOTE(review): setwd() ties the script to one machine — adjust as needed.
setwd("~/Documents/Macs folder UW/courses/FISH 560 multivariate/course materials")

# The screening helpers used below (replace.missing, drop.var, foa.plots,
# data.trans, data.stand, uv.outliers, mv.outliers, ...) come from the
# course's "biostats" source file. Originally it was loaded interactively
# via File > Source File; source it here so the script is self-contained.
# TODO: confirm the biostats file name/path.
if (file.exists("biostats.R")) {
  source("biostats.R")
}

# Libraries needed for this exercise
library(vegan)
library(permute)
library(pastecs)
library(boot)

# Read in data (first column used as row names)
envdata <- read.csv("env.csv", header = TRUE, row.names = 1)
spedata <- read.csv("abund.csv", header = TRUE, row.names = 1)

# Make sure the file read in OK
head(spedata)

# Summary statistics for the environmental and species data
stat.desc(envdata)
stat.desc(spedata)

# Drop column 1 (it's non-numeric), so we don't have to put in the
# exception on every call
spedata <- spedata[, -1]

## Evaluate missing data ##

# Replace missing data with the median (default)
testdata <- replace.missing(spedata)
# Replace missing data with the mean instead
testdata <- replace.missing(spedata, method = "mean")
# Drop a variable if more than 5% of its observations are missing
testdata <- drop.var(spedata, pct.missing = 5)

## Examine frequency-of-occurrence and abundance graphs ##

# Frequency of abundance; 10 graphs total
foa.plots(spedata)

## Screen variables for "sufficiency" and eliminate if necessary ##

# Drop rare species with non-zero values in fewer than 5% of sites
testdata <- drop.var(spedata, min.po = 5)
# Drop rare species with fewer than 5 non-zero values
testdata <- drop.var(spedata, min.fo = 5)
# Drop abundant generalist species, those in more than 95% of plots
testdata <- drop.var(spedata, max.po = 95)
# Drop variables with too little variation, those with CV < 5%
testdata <- drop.var(spedata, min.cv = 5)

## Examine distributional properties of the data ##

# ECDF = empirical cumulative distribution functions. A variable with a
# uniform distribution shows a perfect diagonal straight line; deviations
# indicate non-uniformity.
ecdf.plots(spedata)
# Conventional histograms
hist.plots(spedata)
# Box-and-whisker plots
box.plots(spedata)
# Normal quantile-quantile plots
qqnorm.plots(spedata)
# All of the plots above, depicted TOGETHER
uv.plots(spedata)

## Evaluate the need for data transformation ##

# Transform data to log (base 10)
data.trans(spedata, method = "log")
# "Power" transformation (includes binary transformation); an exponent of
# 0.5 gives a square-root transformation
data.trans(spedata, method = "power", exp = 0.5)
# To transform into presence/absence (i.e. binary transformation), use the
# power method with an exponent equal to zero
data.trans(spedata, method = "power", exp = 0)
# Arcsine square-root transformation, for PROPORTIONAL data (range 0-1)
data.trans(spedata, method = "asin")

# ONCE you've decided on a TRANSFORMATION, save the transformed data as a
# new object. Two examples of saving a log transformation:
tradata <- data.trans(spedata, method = "log", plot = FALSE)
# Same result as above, but the +1 must be added by hand, whereas the
# data.trans function includes it automatically
tradata <- log10(spedata + 1)

## Evaluate the need for data standardization ##

# Standardization adjusts for scale differences between variables
# (e.g. flow = 1-10,000 vs. temp = 1-30) and can also adjust for
# differences between individuals or sites (row standardization).
# To decide whether to standardize, check the CVs of the variables (by
# column and row) — done previously, but here is the command again:
stat.desc(spedata)
# Example of row normalization (rescale each row so the sums equal 1)
data.stand(spedata, method = "total", margin = "row")

## SAVE A TRANSFORMED DATASET AS AN OUTFILE ##

# As an alternative, you can just rename it as a new object and do the
# analyses from there. Here the object `tradata` is written to a file
# named "tradata", comma-separated, with missing values written as NA and
# row names kept.
write.table(tradata, file = "tradata", sep = ",", na = "NA", row.names = TRUE)

## EVALUATING THE DATASET FOR OUTLIERS ##

# uv (univariate) computes standardized values (z-scores) and returns a
# data frame listing the samples and variables greater than a specified
# number of standard deviations (usually 3, but 1 in this example for
# illustrative purposes)
uv.outliers(envdata, id = "Sinuosity:BasinAre", var = "Elev", sd.limit = 1)
# mv (multivariate) outliers based on Euclidean distance; same note here
# (usually 3 SD, but 1 in this example for illustrative purposes)
mv.outliers(envdata, method = "euclidean", sd.limit = 1)
# Another multivariate option is Mahalanobis distance
mv.outliers(envdata, method = "mahalanobis", sd.limit = 1)

## Exercise ##

# Need my own data first, then:
# 1. Summary statistics
stat.desc(envdata)
# 2a. Do outliers exist?
mv.outliers(envdata, method = "euclidean", sd.limit = 3)
# 2b.